Proteins of
Interest
proteins_of_interest <- c(
"PDCD4",
"LGALS1",
"LGALS3",
"LGALSL", #replaced galectin by the gene name
"CCL17",
"BCCIP",
"CDC42",
"CDC37",
"CDC38",
"CCD86",
"LPXN",
"PAX",
"CYTB",
"SWI/SNF",
"NFKB1",
"TNFAIP8",
"ST13",
"NEUA",
"CYC1",
"PDCD5",
"PDCD6",
"PDCD6IP",
"SERPINB6",
"CD63",
"CD70",
"LCA",
"NUDC",
"BAG6",
"BCLAF1",
"JAK",
"STAT6",
"STAT3",
"STAT5A",
"STAT1",
"NUDT5",
"PRCP",
"SMARCC2",
"BANF1",
"BAF", # added baf because BANF1 = BAF is not possible
"MANF",
"NENF",
"DDB1",
"ARMT1",
"FEN1",
"APEX1",
"PAFAH1B1",
"EEF1E1",
"HDGF",
"GRB2",
"MYDGF",
"OGFR",
"CCAR1",
"CCAR2",
"AIFM1",
"API5",
"ACIN1",
"RBM4",
"SDHB",
"PURA",
"SDHA",
"DLST",
"SUCLG1",
"OXCT1",
"OGDH",
"SUCLG2",
"SUCLA2",
"ALDH",
"SOD2",
"MCM2",
"ACY1",
"GATM",
"HEATR",
"NUP210",
"NUP214",
"VDAC1",
"VDAC2",
"VDAC3",
"BCCP",
"HK2",
"HK1",
"PARP1",
"PARP4",
"H2AFX",
"PCYT1A",
"HSPA1B",
"CASP3",
"SLC4A7",
"EPS15L1",
"KCNAB2",
"TNFRSF8",
"CCBL2",
"GAR1"
)
Load Libraries &
Plot function & Colors
Functions
############################################################
# Function: run_deqms_pipeline
# Purpose: Fit limma model, apply DEqMS (spectraCounteBayes),
# and show the variance boxplot.
############################################################
run_deqms_pipeline <- function(protein.matrix, design, pep.count.table,
contrast_string = "classC-classE",
main_title = "Label-free dataset") {
# Step 1: Fit linear model with limma
fit1 <- limma::lmFit(protein.matrix, design = design)
# Step 2: Define contrasts
cont <- limma::makeContrasts(contrasts = contrast_string, levels = design)
fit2 <- limma::contrasts.fit(fit1, contrasts = cont)
# Step 3: Empirical Bayes shrinkage
fit3 <- limma::eBayes(fit2)
fit3$sigma[which(fit3$sigma == 0)] <- 1e-14
# Step 4: Attach peptide counts
fit3$count <- pep.count.table[rownames(fit3$coefficients), "count"]
# Validate peptide count input
if (any(is.na(fit3$count)) || min(fit3$count, na.rm = TRUE) <= 0) {
stop("Error: At least one entry in 'pep.count.table$count' is NA or <= 0. Please verify inputs.")
}
# Step 5: DEqMS modeling
fit4 <- DEqMS::spectraCounteBayes(fit3)
# Step 6: Visualization
DEqMS::VarianceBoxplot(fit4, n = 20,
main = main_title,
xlab = "peptide count + 1")
return(fit4)
}
############################################################
# Function: plot_volcano
# Purpose: Create a volcano plot using log2 fold change and q-values
############################################################
plot_volcano <- function(res, title = "", lfc_limit = NA) {
res.plot <- res %>%
dplyr::filter(!is.na(adj.P.Val)) %>%
dplyr::arrange(adj.P.Val) %>%
dplyr::rowwise() %>%
dplyr::mutate(
threshold = adj.P.Val < 0.05 & abs(logFC) > 1.0,
out_of_bounds = ifelse(is.na(lfc_limit), 0, (abs(logFC) > lfc_limit) * sign(logFC)),
logFC_capped = ifelse(out_of_bounds != 0, lfc_limit * sign(logFC), logFC)
) %>%
dplyr::ungroup()
res.plot %<>%
dplyr::mutate(genelabels = ifelse(threshold, gene, ""))
maxFC <- max(abs(res.plot$logFC))
if (!is.na(lfc_limit) && lfc_limit < maxFC) {
xlim <- lfc_limit
} else {
xlim <- maxFC * 1.04
}
ggplot2::ggplot(res.plot, ggplot2::aes(x = logFC_capped, y = adj.P.Val)) +
ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 0),
ggplot2::aes(colour = threshold), alpha = 0.5) +
ggplot2::geom_point(data = subset(res.plot, out_of_bounds == -1),
ggplot2::aes(colour = threshold), shape = "\u25c4", size = 2) +
ggplot2::geom_point(data = subset(res.plot, out_of_bounds == 1),
ggplot2::aes(colour = threshold), shape = "\u25BA", size = 2) +
ggplot2::geom_hline(yintercept = 0.05, linetype = 2) +
ggplot2::geom_vline(xintercept = c(-1, 1), linetype = 2) +
ggplot2::scale_x_continuous(breaks = scales::pretty_breaks(),
limits = c(-xlim, xlim),
expand = ggplot2::expansion(0.01)) +
ggplot2::scale_y_continuous(trans = c("log10", "reverse"),
breaks = scales::log_breaks(),
labels = scales::scientific) +
ggplot2::ggtitle(title) +
ggplot2::xlab("log2 Fold Change") +
ggplot2::ylab("q-value") +
ggplot2::theme_minimal() +
ggplot2::theme(legend.position = "none")
}
############################################################
# Function: extract_gene_names
# Purpose: Parse FASTA headers and extract gene names (GN=... entries)
############################################################
extract_gene_names <- function(fasta_headers) {
sapply(fasta_headers, function(x) {
if (is.na(x) || x == "") return(NA_character_) # Handle empty or NA lines
# Find all GN=... matches
genes <- unlist(regmatches(x, gregexpr("GN=([^ ]+)", x)))
if (length(genes) == 0) return(NA_character_) # No GN found
# Remove "GN=" prefix
genes <- gsub("GN=", "", genes)
# Remove duplicates and collapse multiple entries
paste(unique(genes), collapse = ";")
})
}
C vs E
############################################################
# C vs E
############################################################
# Load MaxQuant output
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
# Remove reverse/contaminant/site-only identifications
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
# Extract gene names using helper
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
# Extract LFQ intensity columns (as provided)
df.LFQ <- df.prot[, c(260:262, 266:271, 272:277, 278:283)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw"
# Set protein IDs as row names
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
# Transpose: rows = samples, cols = proteins
df.LFQ <- t(df.LFQ)
# Track columns that are entirely NA (align with peptide-count construction)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
# Remove columns that are all NA
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
# Log2 transform
df.LFQ.clean <- log2(df.LFQ.clean)
# Missing-value imputation (local least squares)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
# Valid observations per group
df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[,10:21], 1, function(x) sum(!is.na(x)))
# Protein filter by minimal presence
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]
# Inspect peptide-count columns for these samples
print(colnames(df.prot[, c(59:61, 65:70, 71:76, 77:82)]))
## [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw"
# Build peptide-count table (unique+razor peptides, min across samples)
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 71:76, 77:82)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
# Add pseudocount to avoid zeros
pep.count.table$count <- pep.count.table$count + 1
# Matrix for limma/DEqMS
protein.matrix <- as.matrix(df.LFQ.filter)
# Factors for class and replicate (batch)
class <- factor(c(rep("C", 9), rep("E", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
# Design without intercept; replicate as covariate
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classE")
# Optional: contrast object (not strictly needed since run_deqms_pipeline uses contrast_string)
cont <- limma::makeContrasts(classC - classE, levels = design)
# Run pipeline
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classC-classE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.6234e-16

# Collect results
DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
# Export with timestamp
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
# Process: remove NA logFC and round numeric columns
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
# Proteins of interest (assumes 'proteins_of_interest' is defined upstream)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
# Volcano plot
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs E", vp_lfc_limit)

# Counts for strongly regulated proteins
sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 155
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 132
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 287
C vs RE
############################################################
# C vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
df.LFQ <- df.prot[, c(260:262, 266:271, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 10:21], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]
print(colnames(df.prot[, c(59:61, 65:70, 95:100, 101:106)]))
## [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 95:100, 101:106)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1
protein.matrix <- as.matrix(df.LFQ.filter)
class <- factor(c(rep("C", 9), rep("RE", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classRE")
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classC-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1.048e-16

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 228
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 162
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 390
C vs R
############################################################
# C vs R
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
df.LFQ <- df.prot[, c(260:262, 266:271, 284:289, 290:295)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
df.LFQ$valid_A <- apply(df.LFQ[, 1:9], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 10:21], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 1 | df.LFQ$valid_B >= 2, 1:21]
print(colnames(df.prot[, c(59:61, 65:70, 83:88, 89:94)]))
## [1] "Razor...unique.peptides.HL_540_CR_C1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_C1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_C1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_C2_A_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_C2_A_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_C2_A_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_C2_B_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_C2_B_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_C2_B_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(59:61, 65:70, 83:88, 89:94)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1
protein.matrix <- as.matrix(df.LFQ.filter)
class <- factor(c(rep("C", 9), rep("R", 12)))
replicate <- factor(c(rep("rep1", 3), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classC", "classR")
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classC-classR"
)

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "C vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 15
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 21
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 36
E vs R
############################################################
# E vs R
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
df.LFQ <- df.prot[, c(272:277, 278:283, 284:289, 290:295)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]
print(colnames(df.prot[, c(71:76, 77:82, 83:88, 89:94)]))
## [1] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:76, 77:82, 83:88, 89:94)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1
protein.matrix <- as.matrix(df.LFQ.filter)
class <- factor(c(rep("E", 12), rep("R", 12)))
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classR")
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classE-classR"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 9.4178e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs R", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 80
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 119
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 199
E vs RE
############################################################
# E vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
df.LFQ <- df.prot[, c(272:277, 278:283, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_E1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_E1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_E1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_E1_B_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_E1_B_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_E1_B_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_E2_A_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_E2_A_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]
print(colnames(df.prot[, c(71:76, 77:82, 95:100, 101:106)]))
## [1] "Razor...unique.peptides.HL_540_CR_E1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_E1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_E1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_E1_B_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_E1_B_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_E1_B_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_E2_A_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_E2_A_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_E2_A_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_E2_B_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_E2_B_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_E2_B_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(71:76, 77:82, 95:100, 101:106)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1
protein.matrix <- as.matrix(df.LFQ.filter)
class <- factor(c(rep("E", 12), rep("RE", 12)))
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classRE")
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classE-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 9.9968e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "E vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 10
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 11
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 21
R vs RE
############################################################
# R vs RE
############################################################
df <- read.csv("./results_maxquant/hl540_cryo_galaxy.txt", sep = "\t")
df.prot <- df
df.prot <- df.prot[!df.prot$Reverse == "+", ]
df.prot <- df.prot[!df.prot$Potential.contaminant == "+", ]
df.prot <- df.prot[!df.prot$Only.identified.by.site == "+", ]
df.prot$Gene.names <- extract_gene_names(df.prot$Fasta.headers)
df.LFQ <- df.prot[, c(284:289, 290:295, 296:301, 302:307)]
df.LFQ[df.LFQ == 0] <- NA; print(colnames(df.LFQ))
## [1] "LFQ.intensity.HL_540_CR_R1_A_01.raw.thermo.raw"
## [2] "LFQ.intensity.HL_540_CR_R1_A_02.raw.thermo.raw"
## [3] "LFQ.intensity.HL_540_CR_R1_A_03.raw.thermo.raw"
## [4] "LFQ.intensity.HL_540_CR_R1_B_01.raw.thermo.raw"
## [5] "LFQ.intensity.HL_540_CR_R1_B_02.raw.thermo.raw"
## [6] "LFQ.intensity.HL_540_CR_R1_B_03.raw.thermo.raw"
## [7] "LFQ.intensity.HL_540_CR_R2_A_01.raw.thermo.raw"
## [8] "LFQ.intensity.HL_540_CR_R2_A_02.raw.thermo.raw"
## [9] "LFQ.intensity.HL_540_CR_R2_A_03.raw.thermo.raw"
## [10] "LFQ.intensity.HL_540_CR_R2_B_01.raw.thermo.raw"
## [11] "LFQ.intensity.HL_540_CR_R2_B_02.raw.thermo.raw"
## [12] "LFQ.intensity.HL_540_CR_R2_B_03.raw.thermo.raw"
## [13] "LFQ.intensity.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "LFQ.intensity.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "LFQ.intensity.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "LFQ.intensity.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "LFQ.intensity.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "LFQ.intensity.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "LFQ.intensity.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "LFQ.intensity.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "LFQ.intensity.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "LFQ.intensity.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "LFQ.intensity.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "LFQ.intensity.HL_540_CR_RE2_B_03.raw.thermo.raw"
rownames(df.LFQ) <- df.prot$Majority.protein.IDs
df.LFQ <- t(df.LFQ)
keep_cols <- colSums(!is.na(df.LFQ)) > 0
removed_col_indices <- which(!keep_cols)
df.LFQ.clean <- df.LFQ[, colSums(!is.na(df.LFQ)) > 0]
df.LFQ.clean <- log2(df.LFQ.clean)
result <- llsImpute(as.matrix(df.LFQ.clean), k = 10, correlation = "pearson", allVariables = TRUE)
df.LFQ <- as.data.frame(completeObs(result))
df.LFQ <- as.data.frame(t(df.LFQ))
df.LFQ$valid_A <- apply(df.LFQ[, 1:12], 1, function(x) sum(!is.na(x)))
df.LFQ$valid_B <- apply(df.LFQ[, 13:24], 1, function(x) sum(!is.na(x)))
df.LFQ.filter <- df.LFQ[df.LFQ$valid_A >= 2 | df.LFQ$valid_B >= 2, 1:24]
print(colnames(df.prot[, c(83:88, 89:94, 95:100, 101:106)]))
## [1] "Razor...unique.peptides.HL_540_CR_R1_A_01.raw.thermo.raw"
## [2] "Razor...unique.peptides.HL_540_CR_R1_A_02.raw.thermo.raw"
## [3] "Razor...unique.peptides.HL_540_CR_R1_A_03.raw.thermo.raw"
## [4] "Razor...unique.peptides.HL_540_CR_R1_B_01.raw.thermo.raw"
## [5] "Razor...unique.peptides.HL_540_CR_R1_B_02.raw.thermo.raw"
## [6] "Razor...unique.peptides.HL_540_CR_R1_B_03.raw.thermo.raw"
## [7] "Razor...unique.peptides.HL_540_CR_R2_A_01.raw.thermo.raw"
## [8] "Razor...unique.peptides.HL_540_CR_R2_A_02.raw.thermo.raw"
## [9] "Razor...unique.peptides.HL_540_CR_R2_A_03.raw.thermo.raw"
## [10] "Razor...unique.peptides.HL_540_CR_R2_B_01.raw.thermo.raw"
## [11] "Razor...unique.peptides.HL_540_CR_R2_B_02.raw.thermo.raw"
## [12] "Razor...unique.peptides.HL_540_CR_R2_B_03.raw.thermo.raw"
## [13] "Razor...unique.peptides.HL_540_CR_RE1_A_01.raw.thermo.raw"
## [14] "Razor...unique.peptides.HL_540_CR_RE1_A_02.raw.thermo.raw"
## [15] "Razor...unique.peptides.HL_540_CR_RE1_A_03.raw.thermo.raw"
## [16] "Razor...unique.peptides.HL_540_CR_RE1_B_01.raw.thermo.raw"
## [17] "Razor...unique.peptides.HL_540_CR_RE1_B_02.raw.thermo.raw"
## [18] "Razor...unique.peptides.HL_540_CR_RE1_B_03.raw.thermo.raw"
## [19] "Razor...unique.peptides.HL_540_CR_RE2_A_01.raw.thermo.raw"
## [20] "Razor...unique.peptides.HL_540_CR_RE2_A_02.raw.thermo.raw"
## [21] "Razor...unique.peptides.HL_540_CR_RE2_A_03.raw.thermo.raw"
## [22] "Razor...unique.peptides.HL_540_CR_RE2_B_01.raw.thermo.raw"
## [23] "Razor...unique.peptides.HL_540_CR_RE2_B_02.raw.thermo.raw"
## [24] "Razor...unique.peptides.HL_540_CR_RE2_B_03.raw.thermo.raw"
pep.count.table <- data.frame(
count = rowMins(as.matrix(df.prot[-removed_col_indices, c(83:88, 89:94, 95:100, 101:106)])),
row.names = df.prot$Majority.protein.IDs[-removed_col_indices]
)
pep.count.table$count <- pep.count.table$count + 1
protein.matrix <- as.matrix(df.LFQ.filter)
class <- factor(c(rep("E", 12), rep("RE", 12))) # (as in your original block)
replicate <- factor(c(rep("rep1", 6), rep("rep2", 6),
rep("rep1", 6), rep("rep2", 6)))
design <- model.matrix(~ 0 + class + replicate)
colnames(design)[1:2] <- c("classE", "classRE")
fit_result <- run_deqms_pipeline(
protein.matrix, design, pep.count.table,
contrast_string = "classE-classRE"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 7.8876e-17

DEqMS.results <- outputResult(fit_result, coef_col = 1)
rownames(df.prot) <- df.prot$Majority.protein.IDs
DEqMS.results$Gene.name <- df.prot[DEqMS.results$gene, ]$Gene.names
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
write.csv(DEqMS.results,
file = paste0("/proj/proteomics/7_combinedHL/evaluation_paper/abc_2025/raw_log2FCdata/hl540_cryo/data_", timestamp, ".csv"))
processed_df <- DEqMS.results %>%
dplyr::filter(!is.na(logFC)) %>%
dplyr::mutate(across(where(is.numeric), ~ round(.x, 3)))
datatable(processed_df, rownames = FALSE)
processed_df_interest <- processed_df %>%
dplyr::filter(Gene.name %in% proteins_of_interest)
datatable(processed_df_interest, rownames = FALSE, caption = "proteins of interest")
vp_lfc_limit <- 5
plot_volcano(DEqMS.results, "R vs RE", vp_lfc_limit)

sig <- DEqMS.results %>% dplyr::filter(sca.P.Value <= 0.05)
n_up <- sum(sig$logFC >= 1.0, na.rm = TRUE)
n_down <- sum(sig$logFC <= -1.0, na.rm = TRUE)
n_total <- sum(abs(sig$logFC) >= 1.0, na.rm = TRUE)
cat("Strongly upregulated (logFC ≥ 1.0):", n_up, "\n")
## Strongly upregulated (logFC ≥ 1.0): 196
cat("Strongly downregulated (logFC ≤ -1.0):", n_down, "\n")
## Strongly downregulated (logFC ≤ -1.0): 124
cat("Total |logFC| ≥ 1.0:", n_total, "\n")
## Total |logFC| ≥ 1.0: 320